/**
 * This program processes an HTML file to generate:
 *
 * 1) The anchors for all HTML header tags (H1, H2, etc.)
 *
 * 2) A 'table of contents' file output using those anchors
 *
 * 3) Anchors for each chapter. (This is a hack because any H4 header is
 * considered to be the beginning of a chapter.)
 */

import std.stdio;
import std.regex;
import std.conv;
import std.string;
import std.array;
import std.exception;
import std.range;
import std.algorithm;
import std.uni;
import std.format;

import alphabet;

/* Represents an entry in the table of contents. One entry is appended for
 * every HTML header line found in the input. Entries are constructed
 * positionally, so the order of the fields matters. */
struct TocEntry
{
    string class_;        /* The value of the header's class attribute. */
    size_t level;         /* The level of the entry (4 for H4, 5 for H5, etc.) */
    string anchorName;    /* The name of the anchor without leading '#' */
    string value;         /* The actual text of the entry */
}

/* Represents an entry in the index. A single entry collects every anchor at
 * which the same index key appears in the input. */
struct IndexEntry
{
    string content;           /* The actual text of the entry */
    string[] anchorValues;    /* The sanitized names of the anchors without
                               * leading '#' (used by the book index) */
    string[] unsanitizedAnchorValues;
        /* The names of the same anchors exactly as they appear in the input,
         * without leading '#' (used by the web index) */
}

version (unittest) {

    void main()
    {}

} else {

/* Program entry point.
 *
 * Expects six command line arguments after the program name: the input HTML
 * file, the output HTML file, the table of contents file, the index file, the
 * web index macros file, and the name of the alphabet.
 *
 * Every line of the input is copied to the output, with anchors inserted as
 * necessary. The table of contents and the index are populated along the way
 * and are written to their own files at the end.
 *
 * Throws an Exception with a usage message when arguments are missing. */
void main(string[] args)
{
    enforce(args.length >= 7,
            format("Usage: %s <input-html> <output-html> <toc-file> " ~
                   "<index-file> <web-index-macros> <alphabet>",
                   args.length ? args[0] : "program"));

    const preFileName = args[1];      /* The input file */
    const postFileName = args[2];     /* The output file */
    const tocFileName = args[3];      /* The table of contents file */
    const indexFileName = args[4];    /* The index file */
    const webIndexMacros = args[5];   /* The index body for the web version */
    const alphabetName = args[6];     /* "english", "turkish", etc. */

    writefln("Generating %s and %s from %s",
             tocFileName, postFileName, preFileName);

    auto preFile = File(preFileName, "r");
    auto postFile = File(postFileName, "w");

    /* These tables will be populated as each line of the input is
     * processed. */
    TocEntry[] tocEntries;
    IndexEntry[string] indexEntries;

    /* This is the name of the chapter anchor, obtained by a hack. */
    string chapterFileName;

    /* Every chapter generated by ddoc starts with the following comment. We
     * will scrape the name of the original D file from that line. The
     * expression is compiled once, outside of the loop, instead of once per
     * input line. */
    auto chapterSourceExpr =
        regex(`<!-- Chapter source: (ders/.*/(.*))\.d -->`);

    foreach (line; preFile.byLine) {
        auto c = matchFirst(line, chapterSourceExpr);

        if (!c.empty) {
            /* Modify the value of the chapter anchor to be used for the rest
             * of the lines of this section of the input. The capture is
             * copied because byLine reuses its buffer. */
            chapterFileName = c[2].to!string;

            postFile.writeln(line);

        } else {
            /* This is an ordinary line. Process it to insert an anchor as
             * necessary, as well as to populate the TOC table. */
            auto linePostTocAndXrefs = processChapterLineTocAndXrefs(
                line, chapterFileName, tocEntries);

            const processedLine = processChapterLineIndexes(
                linePostTocAndXrefs.idup, indexEntries);

            postFile.writeln(processedLine);
        }
    }

    /* XXX - Hack the index section into the toc section as it does not appear
     * in the Makefile.in (partly because we have not generated it yet).
     *
     * The anchor content is replaced with "Dizin" in the pdf.derse_ozel.css
     * file of each Turkish book. */
    tocEntries ~= TocEntry("index", 4, "chapter_index_section", "Index");

    generateToc(tocFileName, tocEntries);

    writefln("Generating %s and %s", indexFileName, webIndexMacros);

    Alphabet alphabet = makeAlphabet(alphabetName);
    generateIndex(indexFileName, webIndexMacros, indexEntries, alphabet);
}

} /* version (unittest) else */

/* Returns the name of the anchor that marks the beginning of a chapter: the
 * chapter's file name prefixed with "chapter_". */
string makeChapterAnchorName(const(char)[] chapterFileName)
{
    return text("chapter_", chapterFileName);
}

/* The recursive part of TOC generation. Writes one <ul> element for the
 * specified level and consumes tocEntries from the front while doing so:
 *
 * - An entry of this level becomes an <li> item of the list.
 *
 * - A deeper entry is written as a nested list by recursing, except for the
 *   sections that are deliberately kept shallow (see below).
 *
 * - A shallower entry ends this list; it is left for the caller to consume.
 */
void generateToc_R(File toc, size_t level, ref const(TocEntry)[] tocEntries)
{
    /* Whether an <li> of this list is still waiting for its closing tag. It
     * is left open so that a nested list can go inside it. */
    bool itemIsOpen = false;

    toc.writeln(`<ul class="toc">`);
    scope(exit) {
        if (itemIsOpen) {
            toc.writeln("</li>");
        }
        toc.writeln("</ul>");
    }

    while (!tocEntries.empty) {
        const current = tocEntries.front;

        if (current.level < level) {
            /* We are moving out to the previous level. We are done. */
            break;
        }

        if (current.level > level) {
            const isShallowSection =
                (current.class_ == "index_section") ||
                (current.class_ == "solution_subsection");

            if ((current.level >= 5) && isShallowSection) {
                /* HACK: Do not include any deeper level for "Sözlük",
                 * "Dizin" ("Index"), and "Exercise Solutions" sections. */
                tocEntries.popFront();

            } else {
                /* We are going in to a deeper level; recurse. */
                generateToc_R(toc, current.level, tocEntries);
            }

            continue;
        }

        /* This is one of our levels; close the previous item, if any, and
         * open a new one. */
        if (itemIsOpen) {
            toc.writeln("</li>");
        }

        const linkClass = current.class_.empty ? "toc" : current.class_;

        toc.writefln(`<li><a class="%s" href="#%s">%s</a>`,
                     linkClass, current.anchorName, current.value);
        tocEntries.popFront();

        itemIsOpen = true;
    }
}

/* This is the non-recursive entry to the TOC generation. Writes the table of
 * contents for tocEntries to the file named tocFileName.
 *
 * The level of the first entry is taken as the top level. If a later entry is
 * shallower than that, the recursive part returns without consuming it; a new
 * top-level list is then started at that entry's level so that no entry is
 * dropped. When there are no entries, the file is created but left empty. */
void generateToc(string tocFileName, const(TocEntry)[] tocEntries)
{
    auto toc = File(tocFileName, "w");

    /* Each call consumes at least the front entry because it is called with
     * that entry's own level; so this loop always makes progress. */
    while (!tocEntries.empty) {
        generateToc_R(toc, tocEntries.front.level, tocEntries);
    }
}

/* The character that is displayed as the text of a link from the index. */
const indexLinkChar = "⬁";

/* Numbered variants of the link text: the link character followed by a
 * superscript number. iota(1, 10) excludes its end; so there are nine
 * strings, numbered 1 through 9, stored at indexes 0 through 8. */
const indexLinkStrings =
    iota(1, 10)
    .map!(i => format("%s<sup>%s</sup>", indexLinkChar, i.to!string))
    .array;

/* Extracts the file name part of an unsanitized index anchor: the shortest
 * text found between "ix_" and the following '.'.
 *
 * Throws an Exception if the anchor does not have that form. */
string anchorFileName(string ix)
{
    enum fileNameExpr = regex(`ix_(.*?)\.`);

    auto captures = ix.matchFirst(fileNameExpr);
    const isRecognized = !captures.empty;

    enforce(isRecognized, format("Unexpected index anchor: %s", ix));

    return captures[1];
}

/* Returns the text of the link at the given zero-based position of an index
 * entry that has 'total' links: the bare link character when there is a
 * single link; otherwise the link character followed by the one-based
 * position in superscript. */
private string indexLinkLabel(size_t position, size_t total)
{
    if (total == 1) {
        return indexLinkChar;
    }

    /* The precomputed table covers only the first few positions. Format any
     * position beyond it on the fly instead of indexing out of bounds. */
    return (position < indexLinkStrings.length)
        ? indexLinkStrings[position]
        : format("%s<sup>%s</sup>", indexLinkChar, position + 1);
}

/* Writes the index in two forms:
 *
 * - The book index, to the file named indexFileName, as HTML lists that are
 *   separated by a header for each initial letter. Its links target the
 *   sanitized anchors.
 *
 * - The web index, to the file named webIndexMacros, as the value of the
 *   INDEX_ENTRIES macro. Its links target the unsanitized anchors inside the
 *   HTML file of each chapter.
 *
 * The entries are ordered by indexSectionOrder() according to the alphabet.
 *
 * Throws an Exception if an entry has no content, if an entry starts with
 * 'ı' (see the limitation below), or if an unsanitized anchor is not of the
 * form expected by anchorFileName(). */
void generateIndex(string indexFileName,
                   string webIndexMacros,
                   const(IndexEntry[string]) indexEntries,
                   Alphabet alphabet)
{
    auto idx = File(indexFileName, "w");
    auto webIdx = File(webIndexMacros, "w");

    /* Scope guards run in reverse order: </ul> is written before </div>. */
    idx.writeln(`<div class="index_section">`);
    scope(exit) idx.writeln("</div>");

    idx.writeln(`<ul class="index_section">`);
    scope(exit) idx.writeln("</ul>");

    webIdx.write(`INDEX_ENTRIES=`);

    dchar lastInitial = ' ';

    auto keys = indexEntries.keys;
    auto sortedKeys = sort!((l, r) => indexSectionOrder(l, r, alphabet))(keys);

    foreach (key; sortedKeys) {
        const entry = indexEntries[key];

        /* The initial letter is needed below; report an entry without any
         * content clearly instead of failing inside 'front'. */
        enforce(!entry.content.empty,
                format("Empty index entry content for key '%s'.", key));

        if (entry.content.front == 'ı') {
            throw new Exception(
                format("Limitation: Current framework will sort this entry " ~
                       " among the 'i's: %s (grep for $(IX %s)).",
                       entry, entry.content));
        }

        dchar initial = alphabet.toUpper(initialLetter(entry.content));

        if ((initial != lastInitial) && initial.isAlpha) {
            if (((initial == 'I') && (lastInitial == 'İ')) ||
                ((initial == 'İ') && (lastInitial == 'I'))) {

                /* HACK: Ignore this case; we do not distinguish between 'i'
                 * and 'I'. (We assume that all words that start with 'I' are
                 * English and should be listed with the 'i's.) */

            } else {
                /* Start the section of a new initial letter: close the
                 * current list, write the header, and open a new list. */
                idx.writeln("</ul>");
                idx.writefln(`<h5 class="index_section">%s</h5>`, initial);
                idx.writeln(`<ul class="index_section">`);

                webIdx.writefln(`<h5 class="web_index_section">%s</h5>`,
                                initial);

                lastInitial = initial;
            }
        }

        // Book index
        {
            idx.writefln(`<li>%s `, entry.content);
            scope(exit) idx.writeln(`</li>`);

            const linkCount = entry.anchorValues.length;

            const indexLinks =
                zip(sequence!"n", entry.anchorValues)
                .map!(t => format(`<a class="index_word" href="#%s">%s</a>`,
                                  t[1],
                                  indexLinkLabel(t[0], linkCount)))
                .array;

            /* Every link is preceded by two non-breaking spaces. */
            idx.writefln("%-(&nbsp;&nbsp;%s%)", indexLinks);
        }

        // Web index
        {
            webIdx.writefln(`%s `, entry.content);

            const linkCount = entry.unsanitizedAnchorValues.length;

            const indexLinks =
                zip(sequence!"n", entry.unsanitizedAnchorValues)
                .map!(t =>
                      format(`<a href="%s.html#%s">%s</a>`,
                             anchorFileName(t[1]),
                             t[1],
                             indexLinkLabel(t[0], linkCount)))
                .array;

            webIdx.writefln("%-(&nbsp;&nbsp;%s%)", indexLinks);
            webIdx.writeln("<br/>");
        }
    }
}

/* Makes a clean anchor name out of a line:
 *
 * - HTML tags are removed altogether, from '<' through the following '>'.
 *
 * - Characters that are valid in an XML name are kept as they are.
 *
 * - Every other character is replaced with what encoded() returns for it;
 *   encoded() throws for a character that it does not know.
 *
 * The result is stripped of leading and trailing whitespace. */
string sanitize(const(char)[] line)
{
    string cleaned;
    bool insideTag = false;

    /* Iterating through stride(1) visits decoded code points (dchar), not
     * UTF-8 code units. */
    foreach (ch; line.stride(1)) {
        if (insideTag) {
            /* Ignore everything that's inside an HTML tag; '>' ends it. */
            insideTag = (ch != '>');
            continue;
        }

        if (ch == '<') {
            insideTag = true;
            continue;
        }

        if (ch.isValidXmlNameChar) {
            cleaned ~= ch;

        } else {
            cleaned ~= encoded(ch);
        }
    }

    return cleaned.strip;
}

/* A range of code points that includes both of its ends. */
struct InclusiveCharRange {
    dchar beg;    /* The first code point of the range */
    dchar end;    /* The last code point of the range */

    /* Returns true if c is within [beg, end]. */
    bool contains(dchar c) const {
        if (c < beg) {
            return false;
        }

        return c <= end;
    }
}

/* Returns true if at least one of the ranges contains c; false otherwise
 * (including when there are no ranges). */
bool isContainedBy(dchar c, const InclusiveCharRange[] ranges) {
    return ranges.any!(candidate => candidate.contains(c));
}

/* Returns true if c is accepted as the first character of an XML name.
 *
 * The ranges follow the NameStartChar production of XML 1.1 with one
 * deliberate exception: although Ali does not agree with EPUB validators, ':'
 * is removed from the valid characters.
 *
 * See http://www.w3.org/TR/xml11/#NT-NameStartChar
 */
bool isValidXmlNameStartChar(dchar c) {
    static immutable InclusiveCharRange[] ranges = [
        { '_', '_' },
        { 'A', 'Z' },
        { 'a', 'z' },
        { '\xC0', '\xD6' },
        { '\xD8', '\xF6' },
        { '\xF8', '\u02FF' },
        { '\u0370', '\u037D' },
        { '\u037F', '\u1FFF' },
        { '\u200C', '\u200D' },
        { '\u2070', '\u218F' },
        { '\u2C00', '\u2FEF' },
        { '\u3001', '\uD7FF' },
        { '\uF900', '\uFDCF' },
        { '\uFDF0', '\uFFFD' },
        { '\U00010000', '\U000EFFFF' },
    ];

    return c.isContainedBy(ranges);
}

/* Returns true if c is accepted inside an XML name: either a character that
 * can start a name or one of the additional characters listed here. */
bool isValidXmlNameChar(dchar c) {
    /* The characters that are valid only after the first character. */
    static immutable InclusiveCharRange[] nonStartRanges = [
        { '-', '-' },
        { '.', '.' },
        { '0', '9' },
        { '\xB7', '\xB7' },
        { '\u0300', '\u036F' },
        { '\u203F', '\u2040' },
    ];

    if (c.isValidXmlNameStartChar) {
        return true;
    }

    return c.isContainedBy(nonStartRanges);
}

/* The characters that encoded() is allowed to encode. Only the keys are
 * consulted by encoded(); it does not read the mapped values. */
static immutable dchar[dchar] encodings;

/* The table is immutable (hence implicitly shared among threads); so it is
 * initialized once per program in a shared module constructor instead of in
 * a thread-local one. */
shared static this() {
    encodings = [
        '!' : 'X',
        '"' : 'X',
        '“' : 'X',
        '”' : 'X',
        '‘' : 'X',
        '’' : 'X',
        '#' : 'X',
        '$' : 'X',
        '%' : 'X',
        '&' : 'X',
        '\'' : 'X',
        '(' : 'X',
        ')' : 'X',
        '（' : 'X',
        '）' : 'X',
        '*' : 'X',
        '+' : 'X',
        ',' : 'X',
        '/' : 'X',
        ':' : 'X',
        ';' : 'X',
        '=' : 'X',
        '?' : 'X',
        '@' : 'X',
        '[' : 'X',
        '\\' : 'X',
        ']' : 'X',
        '^' : 'X',
        '`' : 'X',
        '{' : 'X',
        '|' : 'X',
        '}' : 'X',
        '~' : 'X',
    ];
}

/* Returns the replacement of a character that cannot appear in an anchor
 * name: "_" for the space character; otherwise the hexadecimal code point
 * value of the character between two dots.
 *
 * Throws an Exception for a character (other than space) that is not a key of
 * the encodings table. */
string encoded(dchar c) {
    if (c != ' ') {
        const isKnown = ((c in encodings) !is null);

        /* If this throws, try enabling other character categories inside
         * isValidXmlNameStartChar() and isValidXmlNameChar(). */
        enforce(isKnown, format("No encoding for %c (%x)", c, c));

        return format(".%x.", c);
    }

    return "_";
}

/* Returns an empty HTML anchor element that has the specified name as its
 * id. */
string makeAnchor(const(char)[] anchor)
{
    return text(`<a id="`, anchor, `"></a>`);
}

/* Makes a chapter cross reference: a link to the anchor of a chapter.
 *
 * The second capture of the match is used as the file name of the chapter
 * and the third capture is used as the text of the link. The result is a
 * mutable copy. */
char[] makeChapterRef(Captures!(const(char)[]) refMatch)
{
    const targetAnchor = makeChapterAnchorName(refMatch[2]);
    const linkText = refMatch[3];

    return format(`<a class="xref" href="#%s">%s</a>`,
                  targetAnchor, linkText).dup;
}

/* Returns a copy of the line without its anchor elements. Everything from
 * each "<a " through the next "</a>" is removed, the text in between
 * included. If there is no "</a>" after a "<a ", the rest of the line is
 * removed as well. */
string removeAnchors(const(char)[] line)
{
    const(char)[] remaining = line;

    while (true) {
        auto opening = remaining.findSplit("<a ");

        if (opening[1].empty) {
            /* No anchor is left. */
            return remaining.idup;
        }

        auto closing = opening[2].findSplit("</a>");

        /* Join what is before the anchor with what is after it and search
         * again from the beginning. */
        remaining = opening[0] ~ closing[2];
    }
}

/* Processes a line of a chapter for the table of contents and for cross
 * references. Only the first of the following cases that applies is handled:
 *
 * 1) A header line (<h4 class="...">, etc.): An anchor is inserted inside the
 *    header and an entry is appended to tocEntries. A level 4 header, as well
 *    as any header of a ".cozum" chapter, also receives the chapter anchor.
 *
 * 2) A line that links to the HTML file of a chapter: Every such link is
 *    replaced with a link to the anchor of that chapter.
 *
 * 3) A line that contains a solution link: The link is replaced with a link
 *    to the anchor of the ".cozum" chapter of the current chapter.
 *
 * Any other line is returned as is (not copied). */
const(char)[] processChapterLineTocAndXrefs(const(char)[] line,
                                            const(char)[] chapterFileName,
                                            ref TocEntry[] tocEntries)
{
    enum headerExpr = regex(
        `(<[hH]([0-9]) class="(.*?)"[^>]*>)(.*)(</[hH][0-9]>)`);
    enum refExpr = regex(
        `<a href="(/ders/[^/]*/(.*?)\.html)">(.*?)</a>`, "g");
    enum solutionExpr = regex(
        `<a target="ddili_cozum" href="">(<i>.*</i>)</a>`);

    auto header = matchFirst(line, headerExpr);

    if (!header.empty) {
        const openTag = header[1];
        const levelText = header[2];
        const headerClass = header[3];
        const headerText = header[4];
        const closeTag = header[5];

        /* The anchor of the heading is derived from the entire line. */
        const headingAnchorName =
            format("%s_%s", chapterFileName, sanitize(line.strip));
        const headingAnchor = makeAnchor(headingAnchorName);

        const chapterAnchorName = makeChapterAnchorName(chapterFileName);

        const startsChapter =
            (levelText == "4") || chapterFileName.endsWith(".cozum");

        const(char)[] rewritten;

        if (startsChapter) {
            string linkedText = headerText.idup;

            const solutionSplit = chapterAnchorName.findSplit(".cozum");
            if (!solutionSplit[1].empty) {
                /* Add a back reference to the actual chapter of this
                 * solution. */
                linkedText = format(`<a class="xref" href="#%s">%s</a>`,
                                    solutionSplit[0], linkedText);
            }

            /* As a workaround for a prince bug, we don't leave values of
             * chapter anchors empty. Instead, we wrap it around the H4 anchor
             * and text. See:
             *
             *   http://www.princexml.com/forum/topic/1883
             */
            rewritten = format("%s%s%s%s%s", openTag,
                               makeAnchor(chapterAnchorName),
                               headingAnchor, linkedText, closeTag);

        } else {
            rewritten = format("%s%s%s%s",
                               openTag, headingAnchor, headerText, closeTag);
        }

        /* The captures are slices of the line; they are copied before being
         * stored in the table. */
        tocEntries ~= TocEntry(
            headerClass.to!string, levelText.to!size_t, headingAnchorName,
            removeAnchors(headerText.to!string));

        return rewritten;
    }

    if (matchFirst(line, refExpr)) {
        auto output = appender!(char[])();
        replaceAllInto!makeChapterRef(output, line, refExpr);

        return output.data;
    }

    if (matchFirst(line, solutionExpr)) {
        /* Makes a link to the solutions of the current chapter; the first
         * capture is kept as the text of the link. */
        char[] makeSolutionRef(Captures!(const(char)[]) refMatch)
        {
            const linkText = refMatch[1];

            return format(`<a class="xref" href="#chapter_%s.cozum">%s</a>`,
                          chapterFileName, linkText).dup;
        }

        auto output = appender!(char[])();
        replaceAllInto!makeSolutionRef(output, line, solutionExpr);

        return output.data;
    }

    return line;
}

/* Processes a line of a chapter for the index.
 *
 * Every index anchor of the form <a id="ix_FILE.KEY" content="CONTENT"></a>
 * is replaced with a plain anchor that has the sanitized id, and is recorded
 * in indexEntries under KEY. Anchors that share a key are accumulated in the
 * same entry.
 *
 * A line without any index anchor is returned as is (not copied).
 *
 * Throws an Exception if a key is seen again with a different content, or if
 * the id contains a character that sanitize() cannot encode. */
const(char)[] processChapterLineIndexes(const(char)[] line,
                                        ref IndexEntry[string] indexEntries)
{
    const(char)[] result;

    enum idxExpr = regex(
        `<a id="(ix_.*?\.(.*?))" content="(.*?)"></a>`);

    if (matchFirst(line, idxExpr)) {
        auto sink = appender!(char[])();

        char[] makeIndexAnchor(Captures!(const(char)[]) match)
        {
            /* The captures are slices of the line, which belongs to the
             * caller and is not guaranteed to be immutable. Everything that
             * outlives this call (the key of the table, most importantly) is
             * copied first. */
            string anchorValue = sanitize(match[1]);
            string unsanitizedAnchorValue = match[1].idup;
            string anchorKey = match[2].idup;
            string content = match[3].idup;

            IndexEntry* entry = anchorKey in indexEntries;

            if (entry) {
                enforce(entry.content == content,
                        format("Mismatched index section anchor contents: " ~
                               "'%s' versus '%s' of %s",
                               content, entry.content, *entry));

                entry.anchorValues ~= anchorValue;
                entry.unsanitizedAnchorValues ~= unsanitizedAnchorValue;

            } else {
                indexEntries[anchorKey] =
                    IndexEntry(content,
                               [ anchorValue ],
                               [ unsanitizedAnchorValue ]);
            }

            return format(`<a id="%s"></a>`, anchorValue).dup;
        }

        replaceAllInto!makeIndexAnchor(sink, line, idxExpr);
        result = sink.data;

    } else {
        result = line;
    }

    return result;
}
